# Notebook setup: load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys
# Extend the import search path two directories up; presumably this is where the
# project-local modules imported further down (data_pipeline, models, utils)
# live — TODO confirm.
sys.path.append("../../")
# Input data directory and model export directory, relative to the working directory.
DATA_PATH = Path('../../../data/')
MODEL_PATH = Path('../../../models/keras2lwtnn')
import cufflinks as cf
import plotly.offline as py
# Switch plotly and cufflinks to offline notebook rendering.
py.init_notebook_mode()
cf.go_offline()
import plotly.graph_objs as go
import os
# Fraction of the stored training set to use; rows are drawn with a fixed seed.
k = 1
sampled_data = pd.read_hdf(
    DATA_PATH / 'train_data.h5',
    'train_set',
).sample(frac=k, random_state=137)

# Columns removed before training.
unused_features = [
    'seed_nbIT',
    'seed_nLayers',
    'seed_mva_value',
    'seed_nLHCbIDs',
    'is_downstream_reconstructible_not_electron',
    'is_true_seed',
    'has_MCParticle_not_electron',
    'has_MCParticle',
]

# Drop the unused columns and store everything that remains as float32.
data = sampled_data.drop(columns=unused_features).astype(np.float32)
data.info()
from sklearn.model_selection import train_test_split

# Single target column; every other column is treated as a feature.
label_names = ['is_downstream_reconstructible']

# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=137,
)

# Separate features from the integer-encoded labels for both splits.
x_train = train_set.drop(columns=label_names)
y_train = train_set[label_names].astype(np.int32)
x_test = test_set.drop(columns=label_names)
y_test = test_set[label_names].astype(np.int32)
x_test.head()
from data_pipeline import data_pipeline_all_labels as data_pipeline

# Fit the project's preprocessing pipeline on the training features only and
# apply the already-fitted transformation to the held-out features.
pipeline = data_pipeline()
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)
x_test.head()

# Interactive histogram of a 1% sample of the transformed training features.
x_train.sample(frac=0.01).iplot(kind='hist', bins=50)

# Underscores are special characters in LaTeX, so they are escaped in the
# column names used as subplot titles. The raw string yields the same
# backslash-underscore text without relying on the invalid "\_" escape
# sequence, which newer Python versions warn about.
data_renamed_columns = {column: column.replace("_", r"\_") for column in x_train.columns}
# NOTE(review): usetex and the serif font stay enabled for every matplotlib
# figure created after this point, including ones whose labels are not
# escaped — confirm that this is intended.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
x_train.rename(columns=data_renamed_columns).hist(bins=50, figsize=(15,15))
# plt.savefig('features_scaled_hist.eps')
x_train.hist(bins=50, figsize=(10,10))
x_train.mean()
x_test.mean()
# x_test.sample(frac=0.05).iplot(kind='hist', bins=50)
y_train.hist()
# y_test.hist()
from sklearn.utils import class_weight

# Per-class loss weights computed by scikit-learn's 'balanced' heuristic from
# the training labels.
label_values = y_train['is_downstream_reconstructible']
class_labels = np.unique(label_values)
balanced_weights = class_weight.compute_class_weight(
    # Keyword arguments: recent scikit-learn releases reject positional use.
    class_weight='balanced',
    classes=class_labels,
    y=label_values,
)
# Keras documents `class_weight` as a {class index: weight} mapping, so the
# weight array is converted to a dict with plain Python keys and values.
# NOTE(review): this assumes KerasDNN.fit forwards `class_weight` unchanged to
# the underlying Keras model — confirm against the wrapper.
# The name intentionally rebinds the imported module, as the training cell
# reads the weights from `class_weight`.
class_weight = {
    int(label): float(weight)
    for label, weight in zip(class_labels, balanced_weights)
}
from keras.callbacks import TensorBoard
from models import KerasDNN
import time
import keras

# Timestamp string identifying this training run; it names the TensorBoard
# log directory below.
date_created = time.ctime()

# Network hyperparameters, passed to the project's KerasDNN wrapper as keywords.
network_options = dict(
    neurons=124,
    layers=10,
    dropout=0.05,
    loss_metric='binary_crossentropy',
    metrics=['accuracy'],
    last_layer_act='sigmoid',
    kernel_initializer='he_normal',
    optimizer='adam',
    batch_norm=False,
    activation='selu',
)
# Positional arguments: one input per feature column, and a single output.
DNNclf = KerasDNN((x_train.shape[1],), (1,), **network_options)

# TensorBoard logging for this run, written under ./logs/<timestamp>.
tensorboard_logger = TensorBoard(
    log_dir=f'./logs/{date_created}',
    histogram_freq=1,
    batch_size=1000,
    write_graph=True,
    write_grads=True,
)

# Train, using the held-out split as validation data.
DNNclf.fit(
    x_train.values,
    y_train.values,
    epochs=200,
    validation_data=(x_test.values, y_test.values),
    class_weight=class_weight,
    batch_size=1000,
    callbacks=[tensorboard_logger],
)
history = DNNclf.model.history
DNNclf.model.summary(line_length=100)

# Loss per epoch for the training and validation data. Each curve set gets its
# own figure so the loss and accuracy curves never share one pair of axes.
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.savefig('loss.eps')

# Accuracy per epoch. Depending on the Keras version the metric is logged as
# 'acc' or as 'accuracy', so use whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.figure()
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
# plt.savefig('acc.eps')
predictions = DNNclf.predict_proba(x_test.values)
predictions_df = pd.DataFrame(predictions, columns=y_test.columns, index=y_test.index)
predictions_df.head()
from sklearn.metrics import accuracy_score
%time accuracy_score(y_train.values, DNNclf.predict_proba(x_train.values) > 0.5)
%time accuracy_score(y_test.values, predictions_df.values> 0.5)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train.values, DNNclf.predict_proba(x_train.values))
roc_auc_score(y_test.values, predictions_df.values)
from sklearn.metrics import log_loss
log_loss(y_train.values, DNNclf.predict_proba(x_train.values))
log_loss(y_test.values, DNNclf.predict_proba(x_test.values))
from utils import plot_roc_curve, plot_true_positives_and_negatives, plot_confusion_matrix
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score

# Display options shared by the two confusion-matrix plots.
confusion_plot_options = dict(
    classes=["ghost","seed"],
    title='Normalized confusion matrix - DNN',
    normalize=True,
)

# test: threshold the stored test probabilities at 0.5
test_predicted_labels = np.array(predictions_df.values > 0.5, dtype=np.int32)
test_confusion = confusion_matrix(y_test.values, test_predicted_labels)
plot_confusion_matrix(test_confusion, **confusion_plot_options)

# train: predict on the training features and threshold at 0.5
train_predicted_labels = np.array(DNNclf.model.predict(x_train.values) > 0.5, dtype=np.int32)
train_confusion = confusion_matrix(y_train.values, train_predicted_labels)
plot_confusion_matrix(train_confusion, **confusion_plot_options)
%autoreload 2
def plot_true_positives_and_negatives(
        y_true,
        probabilities,
        normalize=False,
        step=0.1,
        title='True positives and true negatives vs threshold', ):
    """Plot true positives and true negatives as a function of the threshold.

    For every threshold in ``np.arange(0.0, 1.0, step)`` the probabilities are
    turned into 0/1 predictions (``probabilities > threshold``) and compared
    with ``y_true`` through a confusion matrix. The two resulting curves are
    shown in an interactive plotly figure.

    Args:
        y_true: True labels, encoded as 0 (negative) and 1 (positive).
        probabilities: Predicted scores, compared element-wise against each
            threshold.
        normalize: If true, each confusion-matrix row is divided by its sum,
            so the curves hold rates; otherwise they hold raw counts.
        step: Spacing between consecutive thresholds.
        title: Title of the plotly figure.

    Returns:
        Tuple ``(true_positives_rate, true_negatives_rate, thresholds)`` of
        equally sized 1-D arrays.
    """
    thresholds = np.arange(0.0, 1.0, step)
    true_positives_rate = np.empty(thresholds.shape)
    true_negatives_rate = np.empty(thresholds.shape)
    for i, threshold in enumerate(thresholds):
        classified_examples = np.array(
            probabilities > threshold, dtype=int)
        # Pin the label order so the matrix is always 2x2. Without it the
        # matrix shrinks to 1x1 when only one class occurs in both the labels
        # and the thresholded predictions, and cm[1, 1] fails.
        cm = confusion_matrix(y_true, classified_examples, labels=[0, 1])
        if normalize:
            # Row-wise normalisation: each row corresponds to one true class.
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        true_positives_rate[i] = cm[1, 1]
        true_negatives_rate[i] = cm[0, 0]
    plots = [
        go.Scatter(x=thresholds, y=true_positives_rate, name='true positives'),
        go.Scatter(x=thresholds, y=true_negatives_rate, name='true negatives'),
    ]
    layout = go.Layout(
        title=title,
        xaxis=dict(title='threshold'),
    )
    fig = go.Figure(data=plots, layout=layout)
    py.iplot(fig)
    return true_positives_rate, true_negatives_rate, thresholds
# train
# The training scan uses a finer threshold grid than the test scan, so its
# thresholds are kept in their own variable and are not overwritten below.
true_positives_rate_tr, true_negatives_rate_tr, thresholds_tr = plot_true_positives_and_negatives(
    y_train.values, DNNclf.model.predict(x_train.values),
    title='Thresholds - DNN',
    step=1e-2,
    normalize=True
)
# test
true_positives_rate, true_negatives_rate, thresholds = plot_true_positives_and_negatives(
    y_test.values, predictions_df.values,
    title='Thresholds - DNN',
    step=5e-2,
    normalize=True
)
# Overlay the train and validation curves, each against its own threshold
# grid, since the two grids have different lengths.
plt.figure(figsize=(16,9))
plt.plot(thresholds_tr, true_positives_rate_tr)
plt.plot(thresholds_tr, true_negatives_rate_tr)
plt.plot(thresholds, true_positives_rate)
plt.plot(thresholds, true_negatives_rate)
plt.xlabel('Threshold')
plt.legend(['TP rate train', 'TN rate train', 'TP rate validation', 'TN rate validation'], loc='lower right')
true_positives_rate, true_negatives_rate, thresholds
x_train.mean()
x_test.mean()
# Mark every test example whose prediction, thresholded at 0.5, equals its
# label. Only the label column(s) of predictions_df are compared, so this also
# works when the 'correct' column already exists from an earlier run, and the
# result is flattened to 1-D before being stored as a single column.
predictions_df['correct'] = np.equal(
    y_test.values,
    predictions_df[y_test.columns].values > 0.5,
).ravel()
pipeline.named_steps
def get_pipeline_params(pipeline):
    """Collect shift and scale values from the pipeline steps that have a scaler.

    For every step exposing ``step.scaler`` with ``mean_`` and ``scale_``, the
    first entry of each is converted into ``shift = -mean_[0]`` and
    ``scale = 1.0 / scale_[0]``. Steps without a usable scaler are skipped.

    Args:
        pipeline: Object with a ``named_steps`` mapping of name -> step.

    Returns:
        Dict keyed by the step object itself, each value being
        ``{'shift': ..., 'scale': ...}``.
    """
    # NOTE(review): entries are keyed by the step object, not by the step
    # name — confirm that consumers of this dict expect that.
    d = {}
    for step in pipeline.named_steps.values():
        try:
            scaler = step.scaler
            d[step] = {'shift': -scaler.mean_[0], 'scale': 1.0/scaler.scale_[0]}
        except (AttributeError, TypeError, IndexError):
            # No scaler on this step, or its statistics are missing or empty:
            # skip it. Any other error is unexpected and is left to propagate.
            continue
    return d
get_pipeline_params(pipeline)

# Export the trained network into a directory named after this run's
# timestamp: the architecture as JSON and the weights as HDF5.
model_arch = DNNclf.model.to_json()
model_dir = (MODEL_PATH / date_created).as_posix()
# Create missing parent directories as well, and do not fail when the
# directory already exists, e.g. because this cell is run a second time.
os.makedirs(model_dir, exist_ok=True)
with (MODEL_PATH / date_created / 'architecture.json').open('w') as arch_file:
    arch_file.write(model_arch)
# Hand over a plain string path: not every Keras/h5py version accepts a
# pathlib.Path object here.
DNNclf.model.save_weights((MODEL_PATH / date_created / 'weights.h5').as_posix())
print(date_created)